package org.jeecg.modules.webcrawler.job;

import java.util.Date;
import java.util.List;
import java.util.Random;

import org.apache.commons.lang.StringUtils;
import org.jeecg.common.util.DateUtils;
import org.jeecg.modules.webcrawler.entity.WebCrawlerWord;
import org.jeecg.modules.webcrawler.util.WebCrawlerCacheUtils;
import org.jeecg.modules.webcrawler.util.bloomfilter.BloomFilterHelper;
import org.jeecg.modules.webcrawler.util.bloomfilter.BloomRedisService;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.beans.factory.annotation.Autowired;

import lombok.extern.slf4j.Slf4j;

/**
 * 中国警察网 爬虫
 * 
 * @author Scott
 */
@Slf4j
public class JingChaWangJob implements Job {
	
	@Autowired
    private BloomRedisService redisService;

    @Autowired
    private BloomFilterHelper bloomFilterHelper;
	
	/**
	 * 若参数变量名修改 QuartzJobController中也需对应修改
	 */
	private String parameter;

	public void setParameter(String parameter) {
		this.parameter = parameter;
	}
	
	@Override
	public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException {
		log.info(String.format(" 中国警察网:" + DateUtils.getTimestamp()));
		 //List<WebCrawlerWord> wordList = WebCrawlerCacheUtils.queryWordList();
		//String today = DateUtils.getDate("yyyy-MM-dd");
//		List<WebCrawlerWord> wordList = WebCrawlerCacheUtils.queryWordList();
//		for (WebCrawlerWord word : wordList) {
//			 if(word.getSort() == 2) {//经开区
//				get_list(word.getTitle(), "1", 101);
//			}
//		}
		get_list("邯郸", "1", 1);	//公安
	}
	
	
	/**
	 * 中国警察网
	 */
	public  void get_list(String keyword,String page,int type){
		//整个html内容
		Document doc;
		int errcount = 0;
		try {
			
			 
			Connection conn = Jsoup.connect("http://sousuo.cpd.com.cn/was5/web/search?page="+page+"&channelid=290934&searchword="+keyword+"&orderby=RELEVANCE&token=76.1548987996878.78&perpage=10&outlinepage=10&searchscope=&timescope=&timescopecolumn=&orderby=RELEVANCE&andsen=&total=&orsen=&exclude=").timeout(5000);
			conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.header("Accept-Encoding", "gzip, deflate, sdch");
			conn.header("Accept-Language", "zh-CN,zh;q=0.8");
			conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
			
		    
		       
			
			doc = conn.get();
			//打印html文档的<title>内容
			String docStr = doc.toString();  
			String str = new String(docStr.getBytes("ISO8859-1"), "UTF-8");  
			Document document = Jsoup.parse(str);
			//System.out.println(doc.html());
			String name = doc.getElementsByTag("title").text();
			log.info("***************************************"+keyword+"*****第"+page+"页("+name+")********************************************");
			Elements list = doc.select("div#column1 table li");
			for (Element info : list) {
				String title = info.select("div a").first().text();
				String url = info.select("div a").first().attr("href");
				String time = info.getElementsByClass("pubtime").text().replace(".", "-");
				if(StringUtils.isNotEmpty(url)) {		//爬虫过滤重复url
					if(redisService.includeByBloomFilter(bloomFilterHelper, "sousuo.cpd.com.cn"+type, url)){  //url已存在
						errcount++;
					}else {
						redisService.addByBloomFilter(bloomFilterHelper, "sousuo.cpd.com.cn"+type, url);
						if(WebCrawlerCacheUtils.getTotalCount(url, type) > 0) {
							errcount++;
							break;
						}else {
							Date date = DateUtils.str2Date(time, DateUtils.datetimeFormat);
							WebCrawlerCacheUtils.addArticle(title, url, date, type, "中国警察网");
						}
					}
				}
				log.info(title);
				log.info(url);
				log.info(time);
				log.info("----------------------------------------重复次数"+errcount+"--------------------------------------------------");
			}

			//查询分页列表
			page = doc.select("div#outlinebar span").next().text();
			if(StringUtils.isNotEmpty(page) && errcount < 8) {
				//get_list(keyword, page, type);
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
		
	}
	
	public static void main(String[] args) {
		new JingChaWangJob().get_list("邯郸", "1", 1);
	}
}
